One of the key operational challenges in bike-share system is to maintain a balanced distribution across stations and times to meet user demands. Some stations frequently experience bike shortages where the users cannot get a bike from the docks when they need them. However, some stations frequently experience surpluses where the users cannot return the bikes to the docks when they need them. This rebalance issues become extremely serious during the peak hours due to assymetric commuting patterns. Without timely rebalancing, users may find it difficult to rent or return bikes on timly manners, leading to user dissatisfaction and reduced ridership. Efficient reblancing ahead of user demands ensures the system’s availability and reliability, which also involves real-time data analysis and prediction.
To address this issue, we propose a data-driven approach to predict the bike demand across each stations and time intervals. By analyzing historical data, we can identify patterns and trends in bike usage, allowing us to make informed decisions about where and when to redistribute bikes. This approach not only improves the efficiency of the bike-share system but also enhances the overall user experience by ensuring that bikes are available when and where they are needed. Predictions will be made several hours in advance using the models that incorporate features such as time of day, day of week, historical usage patterns, weather conditions, and current station capacity. By leveraging these data sources, we can optimize the distribution of bikes across the system, reducing the likelihood of shortages and surpluses.
The model will be focused rebalance efforts on the stations in Philadelphia, PA. The model will be trained using the historical bike share data for the month of May, 2022. The model will be used to predict the bike demand for the next 24 hours, and the results will be used to inform rebalancing efforts.
The bike share data is obtained from the Indego. The data includes information on bike trips, including start and end times, start and end stations, and bike IDs. The data is cleaned and preprocessed to remove any duplicates or missing values. The data is then aggregated to hourly intervals to create a time series of bike demand for each station.
bike<-read.csv("data/indego-trips-2022-q2.csv")
may <- bike %>%
mutate(start_time = mdy_hm(start_time)) %>%
filter(month(start_time) == 5) %>%
mutate(interval60 = floor_date(ymd_hms(start_time), unit = "hour"),
interval15 = floor_date(ymd_hms(start_time), unit = "15 mins"),
week = week(interval60),
dotw = wday(interval60, label=TRUE))
may<-may%>%
filter(!is.na(ymd_hms(start_time)))philCensus <-
get_acs(geography = "tract",
variables = c("B01003_001", "B19013_001",
"B02001_002", "B08013_001",
"B08012_001", "B08301_001",
"B08301_010", "B01002_001"),
year = 2022,
state = "PA",
geometry = TRUE,
county=c("Philadelphia"),
output = "wide") %>%
rename(Total_Pop = B01003_001E,
Med_Inc = B19013_001E,
Med_Age = B01002_001E,
White_Pop = B02001_002E,
Travel_Time = B08013_001E,
Num_Commuters = B08012_001E,
Means_of_Transport = B08301_001E,
Total_Public_Trans = B08301_010E) %>%
select(Total_Pop, Med_Inc, White_Pop, Travel_Time,
Means_of_Transport, Total_Public_Trans,
Med_Age,
GEOID, geometry) %>%
mutate(Percent_White = White_Pop / Total_Pop,
Mean_Commute_Time = Travel_Time / Total_Public_Trans,
Percent_Taking_Public_Trans = Total_Public_Trans / Means_of_Transport)## Getting data from the 2018-2022 5-year ACS
## Downloading feature geometry from the Census website. To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
philTracts <-
philCensus %>%
as.data.frame() %>%
distinct(GEOID, .keep_all = TRUE) %>%
select(GEOID, geometry) %>%
st_sfdat_census <- st_join(may %>%
filter(is.na(start_lon) == FALSE &
is.na(start_lat) == FALSE &
is.na(end_lon) == FALSE &
is.na(end_lat) == FALSE) %>%
st_as_sf(., coords = c("start_lon", "start_lat"), crs = 4326),
philTracts %>%
st_transform(crs=4326),
join=st_intersects,
left = TRUE) %>%
rename(Origin.Tract = GEOID) %>%
mutate(start_lon = unlist(map(geometry, 1)),
start_lat = unlist(map(geometry, 2)))%>%
as.data.frame() %>%
select(-geometry)%>%
st_as_sf(., coords = c("end_lon", "end_lat"), crs = 4326) %>%
st_join(., philTracts %>%
st_transform(crs=4326),
join=st_intersects,
left = TRUE) %>%
rename(Destination.Tract = GEOID) %>%
mutate(to_longitude = unlist(map(geometry, 1)),
to_latitude = unlist(map(geometry, 2)))%>%
as.data.frame() %>%
select(-geometry)ggplot(dat_census %>%
group_by(interval60) %>%
tally())+
geom_line(aes(x = interval60, y = n))+
labs(title="Bike share trips per hr. Philadelphia, May, 2022",
x="Date",
y="Number of trips")+
plotThemedat_census %>%
mutate(time_of_day = case_when(hour(interval60) < 7 | hour(interval60) > 18 ~ "Overnight",
hour(interval60) >= 7 & hour(interval60) < 10 ~ "AM Rush",
hour(interval60) >= 10 & hour(interval60) < 15 ~ "Mid-Day",
hour(interval60) >= 15 & hour(interval60) <= 18 ~ "PM Rush"))%>%
group_by(interval60, start_station, time_of_day) %>%
tally()%>%
group_by(start_station, time_of_day)%>%
summarize(mean_trips = mean(n))%>%
ggplot()+
geom_histogram(aes(mean_trips), binwidth = 1)+
labs(title="Mean Number of Hourly Trips Per Station. Philadelphia, May, 2022",
x="Number of trips",
y="Frequency")+
facet_wrap(~time_of_day)+
plotThemeggplot(dat_census %>%
group_by(interval60, start_station) %>%
tally())+
geom_histogram(aes(n), binwidth = 5)+
labs(title="Bike share trips per hr by station. Chicago, May, 2018",
x="Trip Counts",
y="Number of Stations")+
plotThemeggplot(dat_census %>% mutate(hour = hour(start_time)))+
geom_freqpoly(aes(hour, color = dotw), binwidth = 1)+
labs(title="Bike share trips in Chicago, by day of the week, May, 2018",
x="Hour",
y="Trip Counts")+
plotThemeggplot(dat_census %>%
mutate(hour = hour(start_time),
weekend = ifelse(dotw %in% c("Sun", "Sat"), "Weekend", "Weekday")))+
geom_freqpoly(aes(hour, color = weekend), binwidth = 1)+
labs(title="Bike share trips in Chicago - weekend vs weekday, May, 2018",
x="Hour",
y="Trip Counts")+
plotThemeggplot()+
geom_sf(data = philTracts %>%
st_transform(crs=4326))+
geom_point(data = dat_census %>%
mutate(hour = hour(start_time),
weekend = ifelse(dotw %in% c("Sun", "Sat"), "Weekend", "Weekday"),
time_of_day = case_when(hour(interval60) < 7 | hour(interval60) > 18 ~ "Overnight",
hour(interval60) >= 7 & hour(interval60) < 10 ~ "AM Rush",
hour(interval60) >= 10 & hour(interval60) < 15 ~ "Mid-Day",
hour(interval60) >= 15 & hour(interval60) <= 18 ~ "PM Rush"))%>%
group_by(start_station, start_lat, start_lon, weekend, time_of_day) %>%
tally(),
aes(x=start_lon, y = start_lat, color = n),
fill = "transparent", alpha = 0.4, size = 0.3)+
scale_colour_viridis(direction = -1,
discrete = FALSE, option = "D")+
ylim(min(dat_census$start_lat), max(dat_census$start_lat))+
xlim(min(dat_census$start_lon), max(dat_census$start_lon))+
facet_grid(weekend ~ time_of_day)+
labs(title="Bike share trips per hr by station. Philadelphia, May, 2022")+
mapThemeThe weather data is obtained from the National Weather Service (Philadelphia International Airport). The data includes information on temperature, precipitation, and wind speed. The data is cleaned and preprocessed to remove any duplicates or missing values. The data is then aggregated to hourly intervals to create a time series of weather conditions for each station.
weather.Panel <-
riem_measures(station = "PHL", date_start = "2022-05-01", date_end = "2022-05-31") %>%
dplyr::select(valid, tmpf, p01i, sknt)%>%
replace(is.na(.), 0) %>%
mutate(interval60 = ymd_h(substr(valid,1,13))) %>%
mutate(week = week(interval60),
dotw = wday(interval60, label=TRUE)) %>%
group_by(interval60) %>%
summarize(Temperature = max(tmpf),
Precipitation = sum(p01i),
Wind_Speed = max(sknt)) %>%
mutate(Temperature = ifelse(Temperature == 0, 42, Temperature))grid.arrange(
ggplot(weather.Panel, aes(interval60,Precipitation)) + geom_line() +
labs(title="Percipitation", x="Hour", y="Perecipitation") + plotTheme,
ggplot(weather.Panel, aes(interval60,Wind_Speed)) + geom_line() +
labs(title="Wind Speed", x="Hour", y="Wind Speed") + plotTheme,
ggplot(weather.Panel, aes(interval60,Temperature)) + geom_line() +
labs(title="Temperature", x="Hour", y="Temperature") + plotTheme,
top="Weather Data - Philadelphia (PHL) - May, 2022")study.panel <-
expand.grid(interval60=unique(dat_census$interval60),
start_station = unique(dat_census$start_station)) %>%
left_join(., dat_census %>%
select(start_station, Origin.Tract, start_lon, start_lat )%>%
distinct() %>%
group_by(start_station) %>%
slice(1))
nrow(study.panel)## [1] 132432
ride.panel <-
dat_census %>%
mutate(Trip_Counter = 1) %>%
right_join(study.panel) %>%
group_by(interval60, start_station, Origin.Tract, start_lon, start_lat) %>%
summarize(Trip_Count = sum(Trip_Counter, na.rm=T)) %>%
left_join(weather.Panel) %>%
ungroup() %>%
filter(is.na(start_station) == FALSE) %>%
mutate(week = week(interval60),
dotw = wday(interval60, label = TRUE)) %>%
filter(is.na(Origin.Tract) == FALSE)ride.panel <-
left_join(ride.panel, philCensus %>%
as.data.frame() %>%
select(-geometry), by = c("Origin.Tract" = "GEOID"))ride.panel <-
ride.panel %>%
arrange(start_station, interval60) %>%
mutate(lagHour = dplyr::lag(Trip_Count,1),
lag2Hours = dplyr::lag(Trip_Count,2),
lag3Hours = dplyr::lag(Trip_Count,3),
lag4Hours = dplyr::lag(Trip_Count,4),
lag12Hours = dplyr::lag(Trip_Count,12),
lag1day = dplyr::lag(Trip_Count,24),
holiday = ifelse(yday(interval60) == 148,1,0)) %>%
mutate(day = yday(interval60)) %>%
mutate(holidayLag = case_when(dplyr::lag(holiday, 1) == 1 ~ "PlusOneDay",
dplyr::lag(holiday, 2) == 1 ~ "PlustTwoDays",
dplyr::lag(holiday, 3) == 1 ~ "PlustThreeDays",
dplyr::lead(holiday, 1) == 1 ~ "MinusOneDay",
dplyr::lead(holiday, 2) == 1 ~ "MinusTwoDays",
dplyr::lead(holiday, 3) == 1 ~ "MinusThreeDays"),
holidayLag = ifelse(is.na(holidayLag) == TRUE, 0, holidayLag))as.data.frame(ride.panel) %>%
group_by(interval60) %>%
summarise_at(vars(starts_with("lag"), "Trip_Count"), mean, na.rm = TRUE) %>%
gather(Variable, Value, -interval60, -Trip_Count) %>%
mutate(Variable = factor(Variable, levels=c("lagHour","lag2Hours","lag3Hours","lag4Hours",
"lag12Hours","lag1day")))%>%
group_by(Variable) %>%
summarize(correlation = round(cor(Value, Trip_Count),2))## # A tibble: 6 × 2
## Variable correlation
## <fct> <dbl>
## 1 lagHour 0.9
## 2 lag2Hours 0.73
## 3 lag3Hours 0.53
## 4 lag4Hours 0.35
## 5 lag12Hours -0.52
## 6 lag1day 0.8
reg1 <-
lm(Trip_Count ~ factor(hour(interval60)) + factor(dotw) + Temperature, data=ride.Train)
reg2 <-
lm(Trip_Count ~ start_station + factor(dotw)+ Temperature, data=ride.Train)
reg3 <-
lm(Trip_Count ~ start_station + factor(hour(interval60)) + factor(dotw) + Temperature + Precipitation,
data=ride.Train)
reg4 <-
lm(Trip_Count ~ start_station + factor(hour(interval60)) + factor(dotw) + Temperature + Precipitation +
lagHour + lag2Hours +lag3Hours + lag12Hours + lag1day,
data=ride.Train)
reg5 <-
lm(Trip_Count ~ start_station + factor(hour(interval60)) + factor(dotw) + Temperature + Precipitation +
lagHour + lag2Hours +lag3Hours +lag12Hours + lag1day + holidayLag + holiday,
data=ride.Train)week_predictions <-
ride.Test.weekNest %>%
mutate(ATime_FE = map(.x = data, fit = reg1, .f = model_pred),
BSpace_FE = map(.x = data, fit = reg2, .f = model_pred),
CTime_Space_FE = map(.x = data, fit = reg3, .f = model_pred),
DTime_Space_FE_timeLags = map(.x = data, fit = reg4, .f = model_pred),
ETime_Space_FE_timeLags_holidayLags = map(.x = data, fit = reg5, .f = model_pred)) %>%
gather(Regression, Prediction, -data, -week) %>%
mutate(Observed = map(data, pull, Trip_Count),
Absolute_Error = map2(Observed, Prediction, ~ abs(.x - .y)),
MAE = map_dbl(Absolute_Error, mean, na.rm = TRUE),
sd_AE = map_dbl(Absolute_Error, sd, na.rm = TRUE))
week_predictions## # A tibble: 10 × 8
## week data Regression Prediction Observed Absolute_Error MAE sd_AE
## <dbl> <list> <chr> <list> <list> <list> <dbl> <dbl>
## 1 18 <tibble> ATime_FE <dbl> <dbl> <dbl [25,632]> 0.732 0.828
## 2 19 <tibble> ATime_FE <dbl> <dbl> <dbl [29,904]> 0.713 0.827
## 3 18 <tibble> BSpace_FE <dbl> <dbl> <dbl [25,632]> 0.738 0.885
## 4 19 <tibble> BSpace_FE <dbl> <dbl> <dbl [29,904]> 0.699 0.896
## 5 18 <tibble> CTime_Space_FE <dbl> <dbl> <dbl [25,632]> 0.731 0.819
## 6 19 <tibble> CTime_Space_FE <dbl> <dbl> <dbl [29,904]> 0.716 0.816
## 7 18 <tibble> DTime_Space_FE… <dbl> <dbl> <dbl [25,632]> 0.640 0.752
## 8 19 <tibble> DTime_Space_FE… <dbl> <dbl> <dbl [29,904]> 0.609 0.746
## 9 18 <tibble> ETime_Space_FE… <dbl> <dbl> <dbl [25,632]> 0.640 0.752
## 10 19 <tibble> ETime_Space_FE… <dbl> <dbl> <dbl [29,904]> 0.606 0.747
week_predictions %>%
dplyr::select(week, Regression, MAE) %>%
gather(Variable, MAE, -Regression, -week) %>%
ggplot(aes(week, MAE)) +
geom_bar(aes(fill = Regression), position = "dodge", stat="identity") +
scale_fill_manual(values = palette5) +
labs(title = "Mean Absolute Errors by model specification and week") +
plotThemeweek_predictions %>%
mutate(interval60 = map(data, pull, interval60),
start_station = map(data, pull, start_station)) %>%
dplyr::select(interval60, start_station, Observed, Prediction, Regression) %>%
unnest() %>%
gather(Variable, Value, -Regression, -interval60, -start_station) %>%
group_by(Regression, Variable, interval60) %>%
summarize(Value = sum(Value)) %>%
ggplot(aes(interval60, Value, colour=Variable)) +
geom_line(size = 1.1) +
facet_wrap(~Regression, ncol=1) +
labs(title = "Predicted/Observed bike share time series", subtitle = "Chicago; A test set of 2 weeks", x = "Hour", y= "Station Trips") +
plotThemeweek_predictions %>%
mutate(interval60 = map(data, pull, interval60),
start_station = map(data, pull, start_station),
start_lat = map(data, pull, start_lat),
start_lon = map(data, pull, start_lon)) %>%
select(interval60, start_station, start_lon, start_lat, Observed, Prediction, Regression) %>%
unnest() %>%
filter(Regression == "ETime_Space_FE_timeLags_holidayLags") %>%
group_by(start_station, start_lon, start_lat) %>%
summarize(MAE = mean(abs(Observed-Prediction), na.rm = TRUE))%>%
ggplot(.)+
geom_sf(data = philCensus, color = "grey", fill = "transparent")+
geom_point(aes(x = start_lon, y = start_lat, color = MAE),
fill = "transparent", alpha = 0.4)+
scale_colour_viridis(direction = -1,
discrete = FALSE, option = "D")+
ylim(min(dat_census$start_lat), max(dat_census$start_lat))+
xlim(min(dat_census$start_lon), max(dat_census$start_lon))+
labs(title="Mean Abs Error, Test Set, Model 5")+
mapThemeweek_predictions %>%
mutate(interval60 = map(data, pull, interval60),
start_station = map(data, pull, start_station),
start_lat = map(data, pull, start_lat),
start_lon = map(data, pull, start_lon),
dotw = map(data, pull, dotw)) %>%
select(interval60, start_station, start_lon,
start_lat, Observed, Prediction, Regression,
dotw) %>%
unnest() %>%
filter(Regression == "ETime_Space_FE_timeLags_holidayLags")%>%
mutate(weekend = ifelse(dotw %in% c("Sun", "Sat"), "Weekend", "Weekday"),
time_of_day = case_when(hour(interval60) < 7 | hour(interval60) > 18 ~ "Overnight",
hour(interval60) >= 7 & hour(interval60) < 10 ~ "AM Rush",
hour(interval60) >= 10 & hour(interval60) < 15 ~ "Mid-Day",
hour(interval60) >= 15 & hour(interval60) <= 18 ~ "PM Rush"))%>%
ggplot()+
geom_point(aes(x= Observed, y = Prediction))+
geom_smooth(aes(x= Observed, y= Prediction), method = "lm", se = FALSE, color = "red")+
geom_abline(slope = 1, intercept = 0)+
facet_grid(time_of_day~weekend)+
labs(title="Observed vs Predicted",
x="Observed trips",
y="Predicted trips")+
plotThemeweek_predictions %>%
mutate(interval60 = map(data, pull, interval60),
start_station = map(data, pull, start_station),
start_lat = map(data, pull, start_lat),
start_lon = map(data, pull, start_lon),
dotw = map(data, pull, dotw) ) %>%
select(interval60, start_station, start_lon,
start_lat, Observed, Prediction, Regression,
dotw) %>%
unnest() %>%
filter(Regression == "ETime_Space_FE_timeLags_holidayLags")%>%
mutate(weekend = ifelse(dotw %in% c("Sun", "Sat"), "Weekend", "Weekday"),
time_of_day = case_when(hour(interval60) < 7 | hour(interval60) > 18 ~ "Overnight",
hour(interval60) >= 7 & hour(interval60) < 10 ~ "AM Rush",
hour(interval60) >= 10 & hour(interval60) < 15 ~ "Mid-Day",
hour(interval60) >= 15 & hour(interval60) <= 18 ~ "PM Rush")) %>%
group_by(start_station, weekend, time_of_day, start_lon, start_lat) %>%
summarize(MAE = mean(abs(Observed-Prediction), na.rm = TRUE))%>%
ggplot(.)+
geom_sf(data = philCensus, color = "grey", fill = "transparent")+
geom_point(aes(x = start_lon, y = start_lat, color = MAE),
fill = "transparent", size = 0.5, alpha = 0.4)+
scale_colour_viridis(direction = -1,
discrete = FALSE, option = "D")+
ylim(min(dat_census$start_lat), max(dat_census$start_lat))+
xlim(min(dat_census$start_lon), max(dat_census$start_lon))+
facet_grid(weekend~time_of_day)+
labs(title="Mean Absolute Errors, Test Set")+
mapThemeweek_predictions %>%
mutate(interval60 = map(data, pull, interval60),
start_station = map(data, pull, start_station),
start_lat = map(data, pull, start_lat),
start_lon = map(data, pull, start_lon),
dotw = map(data, pull, dotw),
Percent_Taking_Public_Trans = map(data, pull, Percent_Taking_Public_Trans),
Med_Inc = map(data, pull, Med_Inc),
Percent_White = map(data, pull, Percent_White)) %>%
select(interval60, start_station, start_lon,
start_lat, Observed, Prediction, Regression,
dotw, Percent_Taking_Public_Trans, Med_Inc, Percent_White) %>%
unnest() %>%
filter(Regression == "ETime_Space_FE_timeLags_holidayLags")%>%
mutate(weekend = ifelse(dotw %in% c("Sun", "Sat"), "Weekend", "Weekday"),
time_of_day = case_when(hour(interval60) < 7 | hour(interval60) > 18 ~ "Overnight",
hour(interval60) >= 7 & hour(interval60) < 10 ~ "AM Rush",
hour(interval60) >= 10 & hour(interval60) < 15 ~ "Mid-Day",
hour(interval60) >= 15 & hour(interval60) <= 18 ~ "PM Rush")) %>%
filter(time_of_day == "AM Rush") %>%
group_by(start_station, Percent_Taking_Public_Trans, Med_Inc, Percent_White) %>%
summarize(MAE = mean(abs(Observed-Prediction), na.rm = TRUE))%>%
gather(-start_station, -MAE, key = "variable", value = "value")%>%
ggplot(.)+
#geom_sf(data = chicagoCensus, color = "grey", fill = "transparent")+
geom_point(aes(x = value, y = MAE), alpha = 0.4)+
geom_smooth(aes(x = value, y = MAE), method = "lm", se= FALSE)+
facet_wrap(~variable, scales = "free")+
labs(title="Errors as a function of socio-economic variables",
y="Mean Absolute Error (Trips)")+
plotTheme